CRIM 515 Final Project In-Class
12 December 2024
Some R Markdown styles to try, if you’re interested:
“almond”, “awsm.css”, “axist”, “bamboo”, “bullframe”, “holiday”, “kacit”, “latex.css”, “markdown-splendor”, “markdown-retro”, “markdown-air”, “markdown-modest”, “marx”, “minicss”, “new.css”, “no-class”, “picocss”, “sakura”, “sakura-vader”, “semantic”, “simplecss”, “style-sans”, “style-serif”, “stylize”, “superstylin”, “tacit”, “vanilla”, “water”, “water-dark”, “writ”
1. Research Question
EXAMPLE: Using 2008-2024 data, this project forecasts animal-related calls for service counts and locations in the City of Fairfax, VA for each month in 2025.
2. Literature Review
INSERT THAT TEXT, WITH CITATION, HERE.
3. Data
Libraries
# Load the libraries used throughout the analysis.
library(tidyverse)
library(leaflet)
library(sf)
library(tigris)
library(zoo)
library(aTSA)
library(tseries)
library(forecast)

# Get the latest Fairfax calls for service data (Google Drive direct-download
# link built from the file ID).
drive.id <- "1ti7BCMe1vvDVVr75p6lqvgouCy_ae7rn"
calls.full <- read.csv(paste0("https://docs.google.com/uc?id=", drive.id,
                              "&export=download"))

# Format the date and add month columns:
calls.full$DATE <- as.Date(calls.full$date, format = "%Y-%m-%d")
calls.full$MONTHS <- months(calls.full$DATE)  # full month name, e.g. "January"
calls.full$MONTH <- month(calls.full$DATE)    # month number (lubridate)

# Identify a set of calls to analyze, and then filter the data:
group <- c("ANIMAL BITE", "ANIMAL CASE", "ANIMAL CONTROL", "ANIMAL LOST-FOUND")
# FIX: refer to the column directly inside filter(); `calls.full$type`
# bypasses the piped data frame and silently breaks if any pipe step is
# added before filter().
calls.subset <- calls.full %>% filter(type %in% group)

# Calculate calls per day:
calls <- calls.subset %>%
  group_by(DATE) %>%
  summarise(CALL.COUNT = n())

# Fill in blank days: dates with zero calls are absent from the grouped
# counts, so complete() inserts them (with NA counts) before the calendar
# columns are derived from DATE.
calls <- calls %>%
  complete(DATE = seq.Date(min(DATE), max(DATE), by = "day")) %>%
  mutate(WEEKDAY = lubridate::wday(DATE, label = TRUE, week_start = 1),
         MONTH = lubridate::month(DATE, label = TRUE, abbr = FALSE),
         WEEK = isoweek(DATE),
         DAY = day(DATE),
         YEAR = year(DATE))
# The inserted dates have NA counts; make them explicit zeroes.
# FIX: target CALL.COUNT only -- replace() over the whole data frame would
# also clobber NAs in every other column. (CALL.COUNT is the only column
# that can be NA here, so behavior is unchanged but no longer fragile.)
# Also TRUE/FALSE above instead of the reassignable shorthand T/F.
calls$CALL.COUNT[is.na(calls$CALL.COUNT)] <- 0

# Make the data a time series:
# Build a daily time series: a zoo object of call counts indexed by date.
date.index <- seq(from = as.Date(min(calls$DATE)),
                  to = as.Date(max(calls$DATE)), by = 1)
cleaned.calls <- zoo(calls$CALL.COUNT, date.index)

# Generate basic stats and a basic graph:
summary(cleaned.calls)
## Index cleaned.calls
## Min. :2007-01-03 Min. : 0.000
## 1st Qu.:2011-06-18 1st Qu.: 1.000
## Median :2015-12-02 Median : 1.000
## Mean :2015-12-02 Mean : 1.655
## 3rd Qu.:2020-05-17 3rd Qu.: 2.000
## Max. :2024-10-31 Max. :11.000
plot(cleaned.calls)
title("Animal Calls Per Day")

# Make the data stationary:
# First difference of the daily series (the d = 1 candidate for ARIMA).
stationary1 <- diff(cleaned.calls, differences = 1)
# Plot each one:
plot(stationary1)
# Run ADF tests on the original data, and the stationary datasets:
# NOTE(review): both aTSA and tseries export adf.test(); the package loaded
# last masks the other -- confirm which implementation produced this output
# (the warning format below matches tseries::adf.test).
adf.test(as.matrix(stationary1))
## Warning in adf.test(as.matrix(stationary1)): p-value smaller than printed
## p-value
##
## Augmented Dickey-Fuller Test
##
## data: as.matrix(stationary1)
## Dickey-Fuller = -30.635, Lag order = 18, p-value = 0.01
## alternative hypothesis: stationary
The first-differenced series, stationary1, is the winner: the ADF test rejects the unit-root null (p-value 0.01), so the differenced data are stationary.
Determine the p value using PACF:
# Plot the partial autocorrelations, then print the numeric values.
pacf(stationary1)
# FIX: spell out `plot = FALSE` -- the original `pl=FALSE` relied on
# partial argument matching.
pacf(stationary1, plot = FALSE)
##
## Partial autocorrelations of series 'stationary1', by lag
##
## 1 2 3 4 5 6 7 8 9 10 11
## -0.490 -0.325 -0.234 -0.208 -0.168 -0.147 -0.126 -0.100 -0.088 -0.086 -0.051
## 12 13 14 15 16 17 18 19 20 21 22
## -0.068 -0.099 -0.060 -0.044 -0.033 -0.056 -0.028 -0.044 -0.082 -0.026 -0.024
## 23 24 25 26 27 28 29 30 31 32 33
## -0.040 -0.056 -0.031 -0.042 -0.069 -0.018 -0.013 -0.019 -0.003 -0.034 0.001
## 34 35 36 37 38
## -0.032 -0.039 -0.004 -0.009 -0.021
The AR order p (not a statistical p-value) for this example will be 13, the last lag with a clearly significant partial autocorrelation.
Determine the q value using ACF:
# Plot the autocorrelations, then print the numeric values.
acf(stationary1)
# FIX: spell out `plot = FALSE` -- the original `pl=FALSE` relied on
# partial argument matching.
acf(stationary1, plot = FALSE)
##
## Autocorrelations of series 'stationary1', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 -0.490 -0.007 0.005 -0.020 0.014 -0.005 0.003 0.006 -0.005 -0.005
## 11 12 13 14 15 16 17 18 19 20 21
## 0.018 -0.026 -0.010 0.036 -0.009 0.000 -0.018 0.024 -0.020 -0.014 0.048
## 22 23 24 25 26 27 28 29 30 31 32
## -0.024 -0.010 -0.002 0.020 -0.017 -0.011 0.041 -0.020 -0.007 0.010 -0.022
## 33 34 35 36 37 38
## 0.031 -0.034 0.014 0.024 -0.023 -0.005
The MA order q for this example will be 1, since only lag 1 shows a significant autocorrelation.
Build the ARIMA model:
# Fit the model: d = 1 from the differencing analysis, max.p = 13 from the
# PACF, max.q = 1 from the ACF; auto.arima() searches up to those orders.
# FIX: TRUE instead of the reassignable shorthand T for `seasonal`.
arima.calls <- auto.arima(cleaned.calls, d = 1, max.p = 13, max.q = 1,
                          seasonal = TRUE)
summary(arima.calls)
## Series: cleaned.calls
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.0200 -0.9624
## s.e. 0.0129 0.0034
##
## sigma^2 = 2.042: log likelihood = -11563.73
## AIC=23133.45 AICc=23133.46 BIC=23153.8
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.001690847 1.428774 1.119778 -Inf Inf 0.7610518 0.0002094817
Check the model residuals (probably not worth keeping in your Final):
Identify our forecast window:
# Number of days from the last observed date through the end of 2025.
last.observed <- max(calls$DATE)
forecast.window <- as.numeric(as.Date("2025-12-31") - last.observed)

# Forecast 2025
forecast.2025 <- forecast(arima.calls, h = forecast.window)
autoplot(forecast.2025)

# Make a forecast table:
# Mean forecasts and their upper confidence bounds, joined by forecast day.
forecast.values <- as.data.frame(forecast.2025$mean)
forecast.values$ID <- seq.int(nrow(forecast.values))
forecast.upper <- as.data.frame(forecast.2025$upper)
forecast.upper$ID <- seq.int(nrow(forecast.upper))
forecast.values <- forecast.values %>%
  left_join(forecast.upper, by = 'ID')
colnames(forecast.values) <- c("MEAN", "ID", "CI80", "CI95")
# Forecast ID 1 is the first day after the last observed date.
# (Date + numeric is already a Date; the original wrapped this in a
# redundant as.Date().)
forecast.values$DATE <- max(calls$DATE) + forecast.values$ID
forecast.values$MONTH <- months(forecast.values$DATE)

# Filter to 2025, summarize by month:
# FIX: compare Date to Date explicitly instead of relying on implicit
# coercion of the character literal '2024-12-31'.
forecast.values.2025 <- subset(forecast.values, DATE > as.Date("2024-12-31"))
forecast.months <- forecast.values.2025 %>%
  group_by(MONTH) %>%
  summarise(MEAN = round(sum(MEAN), 0),
            FORECAST.95 = round(sum(CI95), 0),
            FORECAST.80 = round(sum(CI80), 0))
# Width of the 95% band beyond the 80% band (stacked-bar segment below).
forecast.months$DIFF <- forecast.months$FORECAST.95 - forecast.months$FORECAST.80

# Graphs
- Calls per day with a trend line:
# Daily call counts as a filled area, with a linear trend line added on top.
graph.calls <- ggplot(calls, aes(x = DATE, y = CALL.COUNT)) +
  geom_line() +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  labs(x = "Years", y = "Call Count", title = "TITLE HERE") +
  geom_area(fill = "lightblue", color = "black")
graph.calls +
  geom_smooth(method = lm, col = "red", se = FALSE) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'
- Calls per day with a trend line and the forecast:
# FIX: select columns by name rather than magic positional indices
# (calls[c(1, 2)] / forecast.values[c(5, 1)]), which silently grab the
# wrong columns if upstream column order ever changes.
calls2024 <- calls[c("DATE", "CALL.COUNT")]
calls2025 <- forecast.values[c("DATE", "MEAN")]
names(calls2025) <- c("DATE", "CALL.COUNT")
# Observed history followed by the forecast horizon, as one series.
new.calls <- rbind(calls2024, calls2025)
graph.new.calls <- ggplot(new.calls, aes(x = DATE, y = CALL.COUNT)) +
  geom_line() +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  xlab("Years") +
  ylab("Call Count") +
  ggtitle("TITLE HERE") +
  geom_area(fill = "lightblue", color = "black")
graph.new.calls +
  geom_smooth(method = lm, col = "red", se = FALSE) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'
- Monthly upper bounds graph:
# Lock the month ordering to the table's row order, then reshape the 80%
# forecast and the 95%-beyond-80% difference into long format so the two
# pieces stack into a single bar per month.
forecast.months$MONTH <- factor(forecast.months$MONTH, levels = forecast.months$MONTH)
forecast.long <- pivot_longer(forecast.months, cols = c(FORECAST.80, DIFF),
                              names_to = "Category", values_to = "Value")
# Reader-friendly legend labels for the stacked segments.
forecast.long$Category <- gsub("DIFF", "95% Confidence Interval", forecast.long$Category)
forecast.long$Category <- gsub("FORECAST.80", "80% Confidence Interval", forecast.long$Category)
month.names <- c("January", "February", "March", "April", "May", "June",
                 "July", "August", "September", "October", "November", "December")
ggplot(forecast.long, aes(x = MONTH, y = Value, fill = fct_rev(Category))) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Value), size = 3, colour = 'white',
            position = position_stack(vjust = 0.5)) +
  labs(title = "City of Fairfax 2025 Monthly Forecast Upper Bounds",
       x = "Month",
       y = "Call Count") +
  scale_fill_manual(values = c("95% Confidence Interval" = "gold",
                               "80% Confidence Interval" = "maroon"),
                    name = "Forecasts") +
  scale_x_discrete(limits = month.names) +
  coord_cartesian(ylim = c(0, 175)) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# - Monthly forecasts by mean predicted value:
# Each month's mean predicted call count as a simple bar chart, in
# calendar order.
ggplot(forecast.months, aes(x = MONTH, y = MEAN)) +
  geom_bar(stat = "identity") +
  labs(title = "City of Fairfax 2025 Mean Monthly Forecast",
       x = "Month",
       y = "Call Count") +
  scale_x_discrete(limits = c("January", "February", "March", "April", "May",
                              "June", "July", "August", "September", "October",
                              "November", "December")) +
  coord_cartesian(ylim = c(0, 50)) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Personally, I would use graph #2 AND either graph #3 or graph #4 (so two
# graphs total).
Maps
Get the geographic data:
# Road centerlines and the city-boundary polygon for the City of Fairfax,
# VA, fetched with tigris (downloads Census TIGER/Line data).
fairfax.roads <- roads("VA", "Fairfax city")
fairfax.outline <- county_subdivisions("VA", "Fairfax city")
## Retrieving data for the year 2021
## (download progress output trimmed)
Make the hotspots by month:
# Hex-binned call locations over the city outline, roads drawn on top,
# one panel per month. assumes calls.subset has lon/lat coordinate
# columns from the raw data -- TODO confirm; rows with missing
# coordinates are dropped by stat_binhex (see warning below).
ggplot() +
  geom_sf(data = fairfax.outline, color = "grey") +
  geom_hex(aes(x = lon, y = lat), data = calls.subset, bins = 6) +
  scale_fill_continuous(type = "viridis") +
  geom_sf(data = fairfax.roads, color = "black", alpha = 0.4) +
  theme_classic() +
  facet_wrap(~ MONTH)
## Warning: Removed 65 rows containing non-finite outside the scale range
## (`stat_binhex()`).
Or, make hotspots by month using only “recent” data (where recent is post-COVID or a similar change to your data):
# FIX: compare the year numerically; `year > '2019'` coerced to a
# lexicographic character comparison (it happens to work for 4-digit
# years, but is fragile). Also drop the redundant `calls.subset$` prefix
# inside subset(), which evaluates columns in the data frame already.
ggplot() +
  geom_sf(data = fairfax.outline, color = "grey") +
  geom_hex(aes(x = lon, y = lat), data = subset(calls.subset, year > 2019),
           bins = 6) +
  scale_fill_continuous(type = "viridis") +
  geom_sf(data = fairfax.roads, color = "black", alpha = 0.4) +
  theme_classic() +
  facet_wrap(~ MONTH)
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_binhex()`).
Don’t include both sets of these maps - either use the full time range OR the post-COVID one. Not both.